import pandas as pd
import numpy as np
import seaborn as sns
# Fix the NumPy RNG seed so any sampling/shuffling below is reproducible.
np.random.seed(44)
# Mount Google Drive (Colab-only) so the project CSVs are reachable on disk.
from google.colab import drive
drive.mount("/content/gdrive")
# Load the pre-split train/test sets from the shared course-project folder.
train = pd.read_csv('/content/gdrive/My Drive/3253 Machine Learning Term Project/train.csv')
test = pd.read_csv('/content/gdrive/My Drive/3253 Machine Learning Term Project/test.csv')
# Quick inspection: row/column counts and per-column dtypes / null counts.
# (Bare expressions like `train.shape` rely on notebook auto-display.)
print(train.shape)
train.shape
train.info()
# Count the unique values of "Target" — the label we will predict — to get
# a first look at the class distribution.
train["Target"].value_counts()
# Check whether the classes are balanced (visualized in the next cell).
# `%matplotlib inline` is an IPython magic: render plots inside the notebook.
%matplotlib inline
import matplotlib.pyplot as plt
def check_data_balance(series, style="seaborn-pastel"):
    """Display value counts of *series* and a pie chart of their shares.

    Used to eyeball class balance of a categorical column (e.g. the target).

    Parameters
    ----------
    series : pd.Series
        Categorical series to inspect.
    style : str
        Matplotlib style name applied while drawing the pie chart.
    """
    # BUG FIX: the exported script had lost the function-body indentation,
    # which is a syntax error in plain Python; reformatted properly.
    with plt.style.context(style):
        counts = series.value_counts()
        # `display` is an IPython/Colab builtin — shows the raw counts table.
        display(counts)
        # Pie chart of class shares; a small uniform explode separates wedges.
        plt.pie(counts, explode=[0.05] * len(counts),
                labels=counts.index, autopct='%1.1f%%')
# Visualize the class balance of the prediction target.
check_data_balance(train["Target"])
# Summary statistics (count/mean/std/quartiles) for every numeric feature.
train.describe()
%matplotlib inline
import matplotlib.pyplot as plt
# Histogram of each feature to inspect its distribution and spot skew/outliers.
train.hist(bins=50, figsize=(20,15))
#save_fig("attribute_histogram_plots")
plt.show()
# Pairwise scatter-matrix plots of the features, drawn in chunks of ~8
# columns each — a single matrix over all 37 columns would be unreadable.
import plotly.express as px

# Column-index boundaries for each chunk (column 0 is skipped).
for lo, hi in [(1, 8), (8, 16), (16, 24), (24, 32), (32, 37)]:
    fig = px.scatter_matrix(train.iloc[:, lo:hi], height=1000)
    fig.show()
# Total number of columns in the training set (notebook auto-display).
len(train.columns)
# PreviousCampaignResult is heavily imbalanced — almost all values are zero.
train['PreviousCampaignResult'].value_counts()

# Raw value counts for the six Product-holding indicator features.
products = train[['Product1', 'Product2', 'Product3', 'Product4', 'Product5', 'Product6']]
#products
products.apply(pd.Series.value_counts)

# Same counts expressed as a percentage contribution per column.
# FIX: call Series.value_counts instead of the top-level pd.value_counts,
# which is deprecated since pandas 2.1 and removed in pandas 3.0.
products.apply(lambda x: x.value_counts(normalize=True).mul(100).round(1).astype(str) + '%')
# Box plots for the nine Transaction features, laid out on a 3x3 grid to
# compare their ranges and outliers side by side.
for idx in range(1, 10):
    # plt.subplot(3, 3, idx) is equivalent to the 331..339 shorthand.
    plt.subplot(3, 3, idx)
    sns.boxplot(train[f"Transaction{idx}"])
# Enlarge the current figure so the nine panels stay legible.
fig = plt.gcf()
fig.set_size_inches(10, 10)
# Count non-zero values in each Transaction column — most entries are zero,
# so non-zero counts show how sparse each feature is.
transactions = train[['Transaction1', 'Transaction2', 'Transaction3', 'Transaction4', 'Transaction5', 'Transaction6', 'Transaction7',
          'Transaction8', 'Transaction9']]
# transactions
transactions.apply(lambda x: np.count_nonzero(x))

# Percentage breakdown of values for each External Account feature.
# FIX: call Series.value_counts instead of the top-level pd.value_counts,
# which is deprecated since pandas 2.1 and removed in pandas 3.0.
external_accounts = train[['ExternalAccount1', 'ExternalAccount2', 'ExternalAccount3', 'ExternalAccount4', 'ExternalAccount5', 'ExternalAccount6', 'ExternalAccount7']]
# external_accounts
external_accounts.apply(lambda x: x.value_counts(normalize=True).mul(100).round(1).astype(str) + '%')
# Distribution of the ActivityIndicator feature.
sns.boxplot(train['ActivityIndicator'])

# Bucket the values at the 50th/75th/99th percentiles, with an explicit
# zero bucket since "no activity" is a distinct customer group.
activity = train["ActivityIndicator"]
q50, q75, q99 = (np.percentile(activity, q) for q in (50, 75, 99))
bins = [-1, 0, q50, q75, q99, np.inf]
activity.value_counts(bins=bins, sort=False, normalize=True).mul(100).round(1).astype(str) + '%'

# Target distribution for customers with no recorded activity...
inactive = train[train['ActivityIndicator'] == 0]
inactive["Target"].value_counts()
inactive["Target"].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'

# ...versus customers with at least some activity.
active = train[train['ActivityIndicator'] != 0]
active["Target"].value_counts()
active["Target"].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
# Distribution of the RegularInteractionIndicator feature.
sns.countplot(train["RegularInteractionIndicator"])

# Bucket the values at the 50th/75th/99th percentiles and show each
# bucket's share as a percentage.
interaction = train["RegularInteractionIndicator"]
cut50, cut75, cut99 = (np.percentile(interaction, q) for q in (50, 75, 99))
interaction.value_counts(bins=[-1, cut50, cut75, cut99, np.inf],
                         sort=False, normalize=True).mul(100).round(1).astype(str) + '%'

# Target split for customers whose interaction frequency score is zero.
infrequent = train[train['RegularInteractionIndicator'] == 0]
infrequent["Target"].value_counts(normalize=True).mul(100).round(1).astype(str) + '%'
# Summarize rate-offer features as per-column percentage distributions.
# BUG FIX: the original list contained 'CompetitiveRate4' twice and omitted
# 'CompetitiveRate3' — almost certainly a typo given the 1..7 series.
# TODO(review): confirm 'CompetitiveRate3' exists in the dataset.
rates = train[['CompetitiveRate1', 'CompetitiveRate2', 'CompetitiveRate3', 'CompetitiveRate4', 'CompetitiveRate5', 'CompetitiveRate6', 'CompetitiveRate7',
       'RateBefore', 'ReferenceRate']]
# FIX: Series.value_counts replaces the top-level pd.value_counts, which is
# deprecated since pandas 2.1 and removed in pandas 3.0.
rates.apply(lambda x: x.value_counts(normalize=True).mul(100).round(1).astype(str) + '%')
# Check whether balance correlates with the activity/interaction indicators.
corr_cols = ['Balance', 'ActivityIndicator', 'RegularInteractionIndicator']
train_corr = train[corr_cols].copy()
# sns.pairplot(train_corr)  # pairwise view; the heatmap below is more compact
# Annotated correlation heatmap, diverging around zero, no color bar.
heatmap_opts = dict(annot=True, linewidth=0.5, center=0,
                    fmt='.1g', cbar=False, cmap='GnBu')
sns.heatmap(train_corr.corr(), **heatmap_opts)
# Observations: